# Chunk defaults for the whole document: echo the code, hide messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(rlang)
library(viridis)

# Every plot starts from the minimal theme with the legend on the right.
theme_set(
  theme_minimal() +
    theme(legend.position = "right")
)

# Continuous colour and fill scales default to viridis.
options(ggplot2.continuous.colour = "viridis", ggplot2.continuous.fill = "viridis")

# Discrete colour and fill scales default to viridis too, by overriding the
# default scale constructors ggplot2 looks up.
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
# Read the cleaned diabetes data set that every later section builds on.
diabetes_df = read_csv(file = "data/cleaned_diabetes_data.csv")
# Binary helper: turn a "No"/"Yes" vector into a factor whose first
# (reference) level is "No". Any other value, including NA, becomes NA.
# NOTE: this global definition masks dplyr::recode_factor() for the rest of
# the document.
recode_factor = function(var) {
  binary_levels = c("No", "Yes")
  factor(var, levels = binary_levels)
}
# Clean the diabetes variables: convert the diabetes outcomes and the core
# demographic columns to factors with an explicit level order.
diabetes_df =
  diabetes_df |>
  mutate(
    has_diabetes = factor(
      has_diabetes,
      levels = c("Not diabetic", "Pre-diabetic", "Diabetic", "Gestational")
    ),
    # Levels ordered by how often each value occurs (most frequent first).
    sex_at_birth = fct_infreq(sex_at_birth),
    # Age bands are ordinal. The argument is spelled `ordered` in full; the
    # previous `order = TRUE` only worked through partial argument matching.
    age_category = factor(
      age_category,
      levels = c(
        "18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
        "55-59", "60-64", "65-69", "70-74", "75-79", "80+"
      ),
      ordered = TRUE
    ),
    pregnant = recode_factor(pregnant),
    eval_type = factor(eval_type, levels = c("Not diabetic", "Type 1", "Type 2")),
    # diab_type is matched against the codes 1 and 2 and given readable labels.
    diab_type = factor(diab_type, levels = c(1, 2), labels = c("Type 1", "Type 2"))
  )
There are three variables that we are interested in understanding –
has_diabetes, diab_type, and
eval_type.
has_diabetes: This variable records each
respondent's answer to the survey question,
“Have you ever been told you had diabetes?” Overall, 432,339 people
responded to this question.
diab_type: This variable comes directly
from the dataset, where respondents who were diabetic per the
has_diabetes question were asked “What type of diabetes do
you have?” Only 22,027 participants answered this question out of the
59,786 people who responded that they had diabetes. Since over 60% of
the respondents with diabetes are unaccounted for, this measure may not
tell us enough about risk factors or comorbidities of T2D in the general
population.
eval_type: Due to the missing data for
diab_type, we created another variable,
eval_type, that infers each person’s diabetes type from
other demographic information in this dataset. We followed this paper
published by the CDC, which used the same BRFSS survey from 2014 to
classify T2D diagnoses. In this paper, a survey respondent was
classified as having type II diabetes if the respondent was older than 30,
not pregnant, and answered yes to the question “Have you ever been told
you have diabetes?”
Following the same approach, we classified a respondent as having type II
diabetes if the respondent was older than 30, not pregnant, and was
diabetic as per the has_diabetes question. We classified a respondent as
having type I diabetes if the respondent was younger than 30 and was
diabetic as per the has_diabetes question.
# has_diabetes ----
# (Section label that had been fused onto the function definition line.)

# Tabulate how many rows fall into each value of one column.
#
# df:  data frame to summarise.
# var: unquoted column name to count by (passed through with {{ }}).
#
# Returns a knitr::kable() table with one row per distinct value of `var`
# (missing values get their own row) and a `count` column.
diabetes_dist = function(df, var) {
  df |>
    group_by({{ var }}) |>
    summarize(count = n()) |>
    knitr::kable()
}
# Bar chart of the number of rows per level of one diabetes variable, with
# the count printed above each bar.
#
# df:  data frame to plot. The body previously ignored this argument and
#      always read the global `diabetes_df`; it now uses `df`.
# var: unquoted column name. It must be a bare symbol because it is also
#      captured with rlang::ensym().
#
# Returns a ggplot object.
diabetes_plot = function(df, var) {
  df |>
    ggplot(aes(x = {{ var }})) +
    geom_bar(aes(fill = factor({{ var }})), color = "black", na.rm = FALSE) +
    geom_text(
      stat = "count",                  # label with the same counts the bars show
      aes(label = after_stat(count)),  # replaces the deprecated `..count..`
      vjust = -0.5,                    # place the label just above the bar
      na.rm = FALSE
    ) +
    labs(
      title = "Distribution of Diabetes",
      x = "Diabetes Status",
      y = "Count",
      # The legend title is blanked by theme() below, so this label is not shown.
      fill = as.character(rlang::ensym(var))
    ) +
    theme_minimal() +
    theme(legend.position = "bottom", legend.title = element_blank())
}
# Counts per has_diabetes level.
diabetes_dist(diabetes_df, has_diabetes)
| has_diabetes | count |
|---|---|
| Not diabetic | 358706 |
| Pre-diabetic | 10594 |
| Diabetic | 59786 |
| Gestational | 3253 |
| NA | 984 |
# Bar chart for has_diabetes.
diabetes_plot(df = diabetes_df, var = has_diabetes)

# diab_type ----
# (Section label that had been fused onto the call below.)
diabetes_dist(df = diabetes_df, var = diab_type)
| diab_type | count |
|---|---|
| Type 1 | 1958 |
| Type 2 | 20069 |
| NA | 411296 |
# Bar chart for diab_type.
diabetes_plot(df = diabetes_df, var = diab_type)

# eval_type ----
# (Section label that had been fused onto the call below.)
diabetes_dist(df = diabetes_df, var = eval_type)
| eval_type | count |
|---|---|
| Not diabetic | 372797 |
| Type 1 | 664 |
| Type 2 | 59007 |
| NA | 855 |
# Bar chart for eval_type.
diabetes_plot(diabetes_df, eval_type)
# Demographic view of the data: the three diabetes variables plus six
# demographic columns, converted to factors and given display names that the
# plotting calls refer to as strings.
diabetes_demographics =
  diabetes_df |>
  mutate(
    good_health = recode_factor(good_health),
    physical_activity = factor(
      physical_activity,
      levels = c("Highly active", "Active", "Insufficiently active", "Inactive")
    ),
    urb_rural = factor(urb_rural, levels = c("Urban", "Rural")),
    race = as.factor(race),
    # NOTE(review): "15K=25K" and "25-35K" are formatted differently from the
    # other income labels. They are kept verbatim because they must match the
    # values stored in the data; confirm against the CSV.
    income = factor(
      income,
      levels = c(
        "Less than 15K", "15K=25K", "25-35K", "35K-50K",
        "50K-100K", "100K-200K", "More than 200K"
      )
    ),
    education = factor(
      education,
      levels = c(
        "Kindergarten or less", "Elementary", "Some high school",
        "High school graduate", "Some college or technical school",
        "College graduate"
      )
    )
  ) |>
  # Keep only the columns of interest and rename them in the same step.
  select(
    `Has Diabetes` = has_diabetes,
    `Reported Type` = diab_type,
    `Evaluated Type` = eval_type,
    `In Good Health` = good_health,
    `Level of Physical Activity` = physical_activity,
    `Urban/Rural Status` = urb_rural,
    `Race/Ethnicity` = race,
    `Income Level` = income,
    `Educational Attainment` = education
  )
This section explores the distribution of demographic information among diabetes diagnoses.
# Dodged bar chart showing how one categorical characteristic is distributed
# within each level of a diabetes variable.
#
# df:          data frame containing both columns.
# comorbidity: name (string) of the characteristic column; drives the bar
#              fill and the legend title.
# diabetes:    name (string) of the diabetes column; drawn on the x axis.
#
# Rows with a missing value in either column are dropped first. Percentages
# are computed within each diabetes level, so the bars of one level sum to 100.
# Returns a ggplot object.
comorbidities_plot = function(df, comorbidity, diabetes) {
  # Turn the column-name strings into symbols for tidy evaluation.
  fill_col = rlang::sym(comorbidity)
  x_col = rlang::sym(diabetes)

  pct_by_group =
    df |>
    filter(!is.na(!!fill_col), !is.na(!!x_col)) |>
    group_by(!!fill_col, !!x_col) |>
    summarize(n = n(), .groups = "drop") |>
    group_by(!!x_col) |>
    mutate(Percent = n / sum(n) * 100)

  ggplot(pct_by_group, aes(x = !!x_col, y = Percent, fill = !!fill_col)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(
      aes(label = sprintf("%.1f", Percent)),
      position = position_dodge(width = 0.9),
      size = 2.5,
      vjust = -0.5
    ) +
    labs(
      title = str_c(diabetes, " by ", comorbidity),
      x = diabetes,
      y = "Percent (%)",
      fill = comorbidity
    ) +
    scale_fill_viridis_d(na.value = "gray50", option = "D") +
    theme_minimal() +
    theme(legend.position = "bottom")
}
# One group of three plots per demographic variable, one plot per diabetes
# variable. The has_diabetes / diab_type / eval_type labels had been fused
# onto the function name (e.g. `has_diabetescomorbidities_plot`), which made
# each call refer to an undefined function; they are now comments.

demo = "In Good Health"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")

demo = "Level of Physical Activity"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")

demo = "Urban/Rural Status"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")

demo = "Race/Ethnicity"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")

demo = "Income Level"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")

demo = "Educational Attainment"
# has_diabetes
comorbidities_plot(diabetes_demographics, demo, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_demographics, demo, "Reported Type")
# eval_type
comorbidities_plot(diabetes_demographics, demo, "Evaluated Type")
# Comorbidity view of the data: the three diabetes variables plus ten
# comorbidity columns, each converted with recode_factor() and given the
# display name that the plotting calls refer to as a string.
diabetes_comorbidities =
  diabetes_df |>
  mutate(
    across(
      c(
        kidney_disease, depression, arthritis, michd, blind,
        covid, stroke, bronchitis, asthma_ever, asthma_now
      ),
      recode_factor
    )
  ) |>
  # Keep only the columns of interest and rename them in the same step.
  select(
    `Has Diabetes` = has_diabetes,
    `Reported Type` = diab_type,
    `Evaluated Type` = eval_type,
    `History of Kidney Disease` = kidney_disease,
    `History of Arthritis` = arthritis,
    `History of Depression` = depression,
    `History of MI/CHD` = michd,
    `Blindness` = blind,
    `Ever Tested Positive for Covid-19` = covid,
    `History of Stroke` = stroke,
    `History of Bronchitis` = bronchitis,
    `History of Asthma` = asthma_ever,
    `Currently Have Asthma` = asthma_now
  )
This section explores the distribution of co-morbidities across diabetes diagnoses.
# One group of plots per comorbidity, one plot per diabetes variable. The
# has_diabetes / diab_type / eval_type labels had been fused onto the
# function name (e.g. `has_diabetescomorbidities_plot`), which made each call
# refer to an undefined function; they are now comments.

comorbidity = "History of Kidney Disease"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "History of Depression"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "Blindness"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "History of MI/CHD"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "History of Stroke"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "History of Arthritis"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

# Asthma: lifetime history and current asthma are shown side by side.
# has_diabetes
comorbidities_plot(diabetes_comorbidities, "History of Asthma", "Has Diabetes")
comorbidities_plot(diabetes_comorbidities, "Currently Have Asthma", "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, "History of Asthma", "Reported Type")
comorbidities_plot(diabetes_comorbidities, "Currently Have Asthma", "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, "History of Asthma", "Evaluated Type")
comorbidities_plot(diabetes_comorbidities, "Currently Have Asthma", "Evaluated Type")

comorbidity = "Ever Tested Positive for Covid-19"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")

comorbidity = "History of Bronchitis"
# has_diabetes
comorbidities_plot(diabetes_comorbidities, comorbidity, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Reported Type")
# eval_type
comorbidities_plot(diabetes_comorbidities, comorbidity, "Evaluated Type")
# Related-factor view of the data: eleven health and behaviour columns plus
# the three diabetes variables, converted to factors and given the display
# names that the plotting calls refer to as strings.
diabetes_factors =
  diabetes_df |>
  mutate(
    # Plain Yes/No columns share the same conversion.
    across(
      c(
        high_bp, chol_meds, chol_check, ecigs,
        obese, cancer, heavy_drinking, binge_drinking
      ),
      recode_factor
    ),
    # a1c_check is mapped from 0/1 to "No"/"Yes" first, then converted like
    # the other Yes/No columns.
    a1c_check = recode_factor(
      case_match(a1c_check, 0 ~ "No", 1 ~ "Yes", NA ~ NA)
    ),
    smoker = factor(
      smoker,
      levels = c(
        "Current smoker (every day)",
        "Current smoker (some days)",
        "Former smoker",
        "Never smoked"
      )
    ),
    prediabetic = factor(prediabetic, levels = c("Yes", "No", "Gestational"))
  ) |>
  # Keep only the columns of interest and rename them in the same step.
  select(
    `High Blood Pressure` = high_bp,
    `Checked A1C in the Past Year` = a1c_check,
    `Meds Usage for Cholesterol` = chol_meds,
    # NOTE(review): the column name `chol_check` suggests "cholesterol
    # checked" rather than "high cholesterol"; confirm the label matches
    # what the column holds.
    `High Cholesterol` = chol_check,
    `Smoking Status` = smoker,
    `E-Cigarette Use` = ecigs,
    `Prediabetes Diagnosis` = prediabetic,
    `Overweight/Obese (Computed)` = obese,
    `Cancer Diagnosis` = cancer,
    `Heavy Drinking` = heavy_drinking,
    `Binge Drinking` = binge_drinking,
    `Has Diabetes` = has_diabetes,
    `Reported Type` = diab_type,
    `Evaluated Type` = eval_type
  )
This section explores the distribution of related factors across diabetes diagnoses.
# One group of plots per related factor, one plot per diabetes variable. The
# has_diabetes / diab_type / eval_type labels had been fused onto the
# function name (e.g. `has_diabetescomorbidities_plot`), which made each call
# refer to an undefined function; they are now comments.
#
# NOTE(review): the variable `factor` shadows the name of base::factor().
# Calls such as factor(x) still find the function, but a distinct name (for
# example `risk_factor`) would be clearer. It is kept here because code
# outside this section may read it.

factor = "High Blood Pressure"
# has_diabetes
comorbidities_plot(diabetes_factors, factor, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, factor, "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, factor, "Evaluated Type")

# Cholesterol: status and medication use are shown side by side.
# has_diabetes
comorbidities_plot(diabetes_factors, "High Cholesterol", "Has Diabetes")
comorbidities_plot(diabetes_factors, "Meds Usage for Cholesterol", "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, "High Cholesterol", "Reported Type")
comorbidities_plot(diabetes_factors, "Meds Usage for Cholesterol", "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, "High Cholesterol", "Evaluated Type")
comorbidities_plot(diabetes_factors, "Meds Usage for Cholesterol", "Evaluated Type")

# Smoking: cigarette status and e-cigarette use are shown side by side.
# has_diabetes
comorbidities_plot(diabetes_factors, "Smoking Status", "Has Diabetes")
comorbidities_plot(diabetes_factors, "E-Cigarette Use", "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, "Smoking Status", "Reported Type")
comorbidities_plot(diabetes_factors, "E-Cigarette Use", "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, "Smoking Status", "Evaluated Type")
comorbidities_plot(diabetes_factors, "E-Cigarette Use", "Evaluated Type")

factor = "Overweight/Obese (Computed)"
# has_diabetes
comorbidities_plot(diabetes_factors, factor, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, factor, "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, factor, "Evaluated Type")

factor = "Cancer Diagnosis"
# has_diabetes
comorbidities_plot(diabetes_factors, factor, "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, factor, "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, factor, "Evaluated Type")

# Alcohol: heavy drinking and binge drinking are shown side by side.
# has_diabetes
comorbidities_plot(diabetes_factors, "Heavy Drinking", "Has Diabetes")
comorbidities_plot(diabetes_factors, "Binge Drinking", "Has Diabetes")
# diab_type
comorbidities_plot(diabetes_factors, "Heavy Drinking", "Reported Type")
comorbidities_plot(diabetes_factors, "Binge Drinking", "Reported Type")
# eval_type
comorbidities_plot(diabetes_factors, "Heavy Drinking", "Evaluated Type")
comorbidities_plot(diabetes_factors, "Binge Drinking", "Evaluated Type")